Rem
Rem $Header: rdbms/demo/dmglrdem.sql /main/14 2012/06/27 11:54:27 jyarmus Exp $
Rem
Rem dmglrdem.sql
Rem
Rem Copyright (c) 2003, 2012, Oracle and/or its affiliates. 
Rem All rights reserved. 
Rem
Rem    NAME
Rem      dmglrdem.sql - Sample program for the DBMS_DATA_MINING package.
Rem
Rem    DESCRIPTION
Rem      This script creates a regression model using GLM based on
Rem      data in the SH (Sales History) schema in the RDBMS and
Rem      demonstrates the use of SQL prediction functions to test
Rem      the model and apply it to new data.
Rem
Rem    NOTES
Rem     
Rem
Rem    MODIFIED   (MM/DD/YY)
Rem    jyarmus     05/31/12 - bug 14112868
Rem    amozes      01/26/12 - updates for 12c
Rem    xbarr       01/09/12 - add prediction_details
Rem    ramkrish    06/14/07 - remove commit after settings
Rem    ramkrish    01/22/08 - add prediction_bounds
Rem    jyarmus     07/20/07 - add feature selection and generation settings
Rem    ramkrish    10/25/07 - replace deprecated get_model calls with catalog
Rem                           queries
Rem    bmilenov    05/08/07 - Change case_id column in row diagnostics table
Rem    bmilenov    12/13/06 - Turn ADP on
Rem    bmilenov    12/05/06 - Change row diagnostic query
Rem    jyarmus     11/21/06 - Change ridge regression constant
Rem    jyarmus     10/13/06 - drop diagnostics table prior to second model
Rem                           build
Rem    jyarmus     10/12/06 - build with and without columns causing 
Rem                           a singularity in covariance matrix    
Rem    bmilenov    10/04/06 - Remove multicolinear columns
Rem    bmilenov    08/22/06 - Remove filtered view
Rem    bmilenov    08/18/06 - Cleanup
Rem    ramkrish    10/02/03 - Creation
  
-- SQL*Plus session settings used for the whole demo run
SET SERVEROUTPUT ON
SET TRIMSPOOL ON
SET PAGESIZE 10000
SET ECHO ON

-----------------------------------------------------------------------
--                            SAMPLE PROBLEM
-----------------------------------------------------------------------
-- Given demographic, purchase, and affinity card membership data for a 
-- set of customers, predict customer's age. Since age is a continuous 
-- variable, this is a regression problem.

-----------------------------------------------------------------------
--                            SET UP AND ANALYZE DATA
-----------------------------------------------------------------------

-- The data for this sample is composed from base tables in the SH Schema
-- (See Sample Schema Documentation) and presented through these views:
-- mining_data_build_v (build data)
-- mining_data_test_v  (test data)
-- mining_data_apply_v (apply data)
-- (See dmsh.sql for view definitions).
--
-----------
-- ANALYSIS
-----------
-- Data preparation for GLM is performed internally

-----------------------------------------------------------------------
--                            BUILD THE MODEL
-----------------------------------------------------------------------

-- Cleanup old model with same name (if any)
-- Drop the model only when the catalog shows it exists. A missing model
-- (first run) is skipped quietly, while a genuine DROP_MODEL failure is
-- reported instead of being silently discarded.
DECLARE
  l_model_count PLS_INTEGER;
BEGIN
  -- Model names are stored in upper case in the catalog views
  SELECT COUNT(*)
    INTO l_model_count
    FROM user_mining_models
   WHERE model_name = 'GLMR_SH_REGR_SAMPLE';
  IF l_model_count > 0 THEN
    DBMS_DATA_MINING.DROP_MODEL('GLMR_SH_Regr_sample');
  END IF;
END;
/

-------------------
-- SPECIFY SETTINGS
--
-- Cleanup old settings table for repeat runs
-- Only ORA-00942 (table or view does not exist) is expected on a first run.
-- Any other error from the DROP is a real failure and is allowed to surface.
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE glmr_sh_sample_settings';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;
END;
/

-- Cleanup diagnostic table (same rule: ignore only a missing table)
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE glmr_sh_sample_diag';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;
END;
/

-- Name/value pairs that configure the model build
CREATE TABLE glmr_sh_sample_settings
(
  setting_name   VARCHAR2(30),
  setting_value  VARCHAR2(4000)
);

-- Turn on feature selection and generation
--  
DECLARE
  -- Stores one setting (name/value pair) in glmr_sh_sample_settings, the
  -- table that is later handed to CREATE_MODEL as settings_table_name.
  PROCEDURE add_setting (p_name IN VARCHAR2, p_value IN VARCHAR2) IS
  BEGIN
    INSERT INTO glmr_sh_sample_settings (setting_name, setting_value)
    VALUES (p_name, p_value);
  END add_setting;
BEGIN
  -- Algorithm: generalized linear model
  add_setting(dbms_data_mining.algo_name,
              dbms_data_mining.algo_generalized_linear_model);
  -- Output row diagnostic statistics into a table named GLMR_SH_SAMPLE_DIAG
  add_setting(dbms_data_mining.glms_diagnostics_table_name,
              'GLMR_SH_SAMPLE_DIAG');
  -- Automatic data preparation on
  add_setting(dbms_data_mining.prep_auto,
              dbms_data_mining.prep_auto_on);
  -- Turn on feature selection
  add_setting(dbms_data_mining.glms_ftr_selection,
              dbms_data_mining.glms_ftr_selection_enable);
  -- Turn on feature generation
  add_setting(dbms_data_mining.glms_ftr_generation,
              dbms_data_mining.glms_ftr_generation_enable);
  /* Examples of possible overrides are shown below. If the user does not
     override, then relevant settings are determined by the algorithm

  -- specify a row weight column
    (dbms_data_mining.odms_row_weight_column_name,<row_weight_column_name>);
  -- specify a missing value treatment method:
     Default:  replace with mean (numeric features) or
               mode (categorical features)
       or delete the row
    (dbms_data_mining.odms_missing_value_treatment,
      dbms_data_mining.odms_missing_value_delete_row);
  -- turn ridge regression on or off
     By default the system turns it on if there is a multicollinearity
    (dbms_data_mining.glms_ridge_regression,
     dbms_data_mining.glms_ridge_reg_enable);
  */
END;
/

---------------------
-- CREATE A NEW MODEL
--
-- Force the column affinity_card to be a feature in the model using
-- dbms_data_mining_transform
--
DECLARE
  -- Transformation list passed to CREATE_MODEL through xform_list
  l_transforms dbms_data_mining_transform.TRANSFORM_LIST;
BEGIN
  -- Register AFFINITY_CARD with the FORCE_IN specification so that it is
  -- kept as a feature of the model. The expression and reverse expression
  -- are the column itself, i.e. the values are left untouched.
  dbms_data_mining_transform.set_transform(
    l_transforms,
    'AFFINITY_CARD',
    NULL,
    'AFFINITY_CARD',
    'AFFINITY_CARD',
    'FORCE_IN');
  -- Build the regression model for target AGE on the build view
  dbms_data_mining.create_model(
    model_name          => 'GLMR_SH_Regr_sample',
    mining_function     => dbms_data_mining.regression,
    data_table_name     => 'mining_data_build_v',
    case_id_column_name => 'cust_id',
    target_column_name  => 'age',
    settings_table_name => 'glmr_sh_sample_settings',
    xform_list          => l_transforms);
END;
/

-------------------------
-- DISPLAY MODEL SETTINGS
--
COLUMN setting_name FORMAT A30
COLUMN setting_value FORMAT A30
SELECT s.setting_name,
       s.setting_value
  FROM user_mining_model_settings s
 WHERE s.model_name = 'GLMR_SH_REGR_SAMPLE'
 ORDER BY s.setting_name;

--------------------------
-- DISPLAY MODEL SIGNATURE
--
COLUMN attribute_name FORMAT A40
COLUMN attribute_type FORMAT A20
SELECT a.attribute_name,
       a.attribute_type
  FROM user_mining_model_attributes a
 WHERE a.model_name = 'GLMR_SH_REGR_SAMPLE'
 ORDER BY a.attribute_name;

------------------------
-- DISPLAY MODEL DETAILS
--
-- Global statistics
SELECT g.*
  FROM TABLE(
         dbms_data_mining.get_model_details_global('GLMR_SH_Regr_sample')
       ) g
 ORDER BY g.global_detail_name;

-- IF the covariance matrix had been invalid, THEN the global details 
-- would have had a row like:
--
--   VALID_COVARIANCE_MATRIX  0 
-- 
-- And, as a result we would only have gotten a limited set of diagnostics.
-- This never happens with feature selection enabled. IF the forced-in feature
-- had caused a multi-collinearity then the model build would have failed.
-- Note that the forced-in feature, AFFINITY_CARD, is kept in the model whether
-- or not it is statistically significant (see its p_value below).
-- It did not cause a multi-collinearity, hence the build succeeded.
--
-- With feature selection disabled, then an invalid covariance matrix is 
-- possible. Then, multi-collinearity, if it exists will cause 
-- RIDGE REGRESSION to kick in, unless it has been specifically disabled by 
-- you. The build will succeed with ridge enabled. However, the covariance 
-- matrix will be invalid since it is not computed by the ridge algorithm.
-- 
-- An important consequence of an invalid covariance matrix is that
-- the model cannot predict confidence bounds - i.e. the result of
-- PREDICTION_BOUNDS function in a SQL query is NULL.
--
-- If accuracy is the primary goal and interpretability is not important, then
-- we note that RIDGE REGRESSION may be preferable to feature selection for
-- some datasets. You can test this by specifically enabling ridge regression.
-- 
-- In this demo, we compute two models. The first (above) has feature
-- selection enabled and produces confidence bounds and a full set of 
-- diagnostics. The second enables ridge regression. It does not produce 
-- confidence bounds. It only produces a limited set of diagnostics.

-- Coefficient statistics
SET LINESIZE 120
COLUMN feature_expression FORMAT A53

-- Coefficient row of the forced-in feature AFFINITY_CARD
SELECT c.feature_expression,
       c.coefficient,
       c.std_error,
       c.test_statistic,
       c.p_value,
       c.std_coefficient,
       c.lower_coeff_limit,
       c.upper_coeff_limit
  FROM TABLE(
         dbms_data_mining.get_model_details_glm('GLMR_SH_Regr_sample')
       ) c
 WHERE c.feature_expression = 'AFFINITY_CARD';

-- Show the features and their p_values
SET LINESIZE 80
SET PAGESIZE 20
SELECT c.feature_expression,
       c.coefficient,
       c.p_value
  FROM TABLE(
         dbms_data_mining.get_model_details_glm('GLMR_SH_Regr_sample')
       ) c
 ORDER BY c.p_value;

-- Row diagnostics
SELECT d.*
  FROM glmr_sh_sample_diag d
 WHERE d.case_id <= 101510
 ORDER BY d.case_id;

-----------------------------------------------------------------------
--                               TEST THE MODEL
-----------------------------------------------------------------------

------------------------------------
-- COMPUTE METRICS TO TEST THE MODEL
--

-- 1. Root Mean Square Error - Sqrt(Mean((x - x')^2))
-- 2. Mean Absolute Error - Mean(|(x - x')|)
--
COLUMN rmse FORMAT 9999.99
COLUMN mae FORMAT 9999.99
-- Join each scored test row back to its actual AGE by cust_id
SELECT SQRT(AVG((scored.pred - actual.age) * (scored.pred - actual.age))) rmse,
       AVG(ABS(scored.pred - actual.age)) mae
  FROM (SELECT cust_id,
               PREDICTION(GLMR_SH_Regr_sample USING *) pred
          FROM mining_data_test_v) scored
       INNER JOIN mining_data_test_v actual
          ON actual.cust_id = scored.cust_id;

-----------------------------------------------------------------------
--                               SCORE NEW DATA
-----------------------------------------------------------------------

-- Since the model has a valid covariance matrix, it is possible
-- to obtain confidence bounds.
-- In addition, provide details to help explain the prediction
SET LONG 20000
SET LINESIZE 200
SET PAGESIZE 100
-- pr = prediction, pl/pu = lower/upper bound, pd = prediction details
SELECT cust_id,
       PREDICTION(GLMR_SH_Regr_sample USING *)               pr,
       PREDICTION_BOUNDS(GLMR_SH_Regr_sample USING *).lower  pl,
       PREDICTION_BOUNDS(GLMR_SH_Regr_sample USING *).upper  pu,
       PREDICTION_DETAILS(GLMR_SH_Regr_sample USING *)       pd
  FROM mining_data_apply_v
 WHERE cust_id < 100010
 ORDER BY cust_id;

-- Next we compare to a model built with ridge regression enabled


-----------------------------------------------------------------------
--                            BUILD A NEW MODEL
-----------------------------------------------------------------------

-- Cleanup diagnostic table
-- Only ORA-00942 (table or view does not exist) is tolerated here.
-- Any other error from the DROP is a real failure and is allowed to surface.
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE glmr_sh_sample_diag';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;
END;
/

---------------------
-- CREATE A NEW MODEL
--
-- Cleanup old model with same name (if any)
-- Drop the model only when the catalog shows it exists, so that a genuine
-- DROP_MODEL failure is reported instead of being silently discarded.
DECLARE
  l_model_count PLS_INTEGER;
BEGIN
  -- Model names are stored in upper case in the catalog views
  SELECT COUNT(*)
    INTO l_model_count
    FROM user_mining_models
   WHERE model_name = 'GLMR_SH_REGR_SAMPLE';
  IF l_model_count > 0 THEN
    DBMS_DATA_MINING.DROP_MODEL('GLMR_SH_Regr_sample');
  END IF;
END;
/

-- SPECIFY SETTINGS
--
-- Cleanup old settings table 
-- Only ORA-00942 (table or view does not exist) is tolerated here.
-- Any other error from the DROP is a real failure and is allowed to surface.
DECLARE
  e_table_missing EXCEPTION;
  PRAGMA EXCEPTION_INIT(e_table_missing, -942);
BEGIN
  EXECUTE IMMEDIATE 'DROP TABLE glmr_sh_sample_settings';
EXCEPTION
  WHEN e_table_missing THEN
    NULL;
END;
/

-- Name/value pairs that configure the second (ridge regression) build
CREATE TABLE glmr_sh_sample_settings
(
  setting_name   VARCHAR2(30),
  setting_value  VARCHAR2(4000)
);

-- Turn on ridge regression
--  
DECLARE
  -- Stores one setting (name/value pair) in glmr_sh_sample_settings, the
  -- table that is later handed to CREATE_MODEL as settings_table_name.
  PROCEDURE add_setting (p_name IN VARCHAR2, p_value IN VARCHAR2) IS
  BEGIN
    INSERT INTO glmr_sh_sample_settings (setting_name, setting_value)
    VALUES (p_name, p_value);
  END add_setting;
BEGIN
  -- Algorithm: generalized linear model
  add_setting(dbms_data_mining.algo_name,
              dbms_data_mining.algo_generalized_linear_model);
  -- Output row diagnostic statistics into a table named GLMR_SH_SAMPLE_DIAG
  add_setting(dbms_data_mining.glms_diagnostics_table_name,
              'GLMR_SH_SAMPLE_DIAG');
  -- Automatic data preparation on
  add_setting(dbms_data_mining.prep_auto,
              dbms_data_mining.prep_auto_on);
  -- Turn on ridge regression
  add_setting(dbms_data_mining.glms_ridge_regression,
              dbms_data_mining.glms_ridge_reg_enable);
END;
/

-- Rebuild the model under the same name, this time without a transform
-- list and with the ridge regression settings.
DECLARE
  c_model_name     CONSTANT VARCHAR2(30) := 'GLMR_SH_Regr_sample';
  c_settings_table CONSTANT VARCHAR2(30) := 'glmr_sh_sample_settings';
BEGIN
  dbms_data_mining.create_model(
    model_name          => c_model_name,
    mining_function     => dbms_data_mining.regression,
    data_table_name     => 'mining_data_build_v',
    case_id_column_name => 'cust_id',
    target_column_name  => 'age',
    settings_table_name => c_settings_table);
END;
/

-------------------------
-- DISPLAY MODEL SETTINGS
--
COLUMN setting_name FORMAT A30
COLUMN setting_value FORMAT A30
SELECT s.setting_name,
       s.setting_value
  FROM user_mining_model_settings s
 WHERE s.model_name = 'GLMR_SH_REGR_SAMPLE'
 ORDER BY s.setting_name;

--------------------------
-- DISPLAY MODEL SIGNATURE
--
COLUMN attribute_name FORMAT A40
COLUMN attribute_type FORMAT A20
SELECT a.attribute_name,
       a.attribute_type
  FROM user_mining_model_attributes a
 WHERE a.model_name = 'GLMR_SH_REGR_SAMPLE'
 ORDER BY a.attribute_name;

------------------------
-- DISPLAY MODEL DETAILS
--
-- Global statistics
SELECT g.*
  FROM TABLE(
         dbms_data_mining.get_model_details_global('GLMR_SH_Regr_sample')
       ) g
 ORDER BY g.global_detail_name;

-- Coefficient statistics
SET LINESIZE 120
COLUMN class FORMAT A20
COLUMN attribute_name FORMAT A20
COLUMN attribute_subname FORMAT A20
COLUMN attribute_value FORMAT A20

-- Coefficient rows for three selected attributes
SELECT c.*
  FROM TABLE(
         dbms_data_mining.get_model_details_glm('GLMR_SH_Regr_sample')
       ) c
 WHERE c.attribute_name IN ('AFFINITY_CARD',
                            'BULK_PACK_DISKETTES',
                            'COUNTRY_NAME')
 ORDER BY c.class,
          c.attribute_name,
          c.attribute_value;
    
-- Limited row diagnostics - residuals only, others are NULL
SELECT d.*
  FROM glmr_sh_sample_diag d
 WHERE d.case_id <= 101510
 ORDER BY d.case_id;

-----------------------------------------------------------------------
--                               TEST THE MODEL
-----------------------------------------------------------------------

------------------------------------
-- COMPUTE METRICS TO TEST THE MODEL
--

-- 1. Root Mean Square Error - Sqrt(Mean((x - x')^2))
-- 2. Mean Absolute Error - Mean(|(x - x')|)
--
COLUMN rmse FORMAT 9999.99
COLUMN mae FORMAT 9999.99
-- Join each scored test row back to its actual AGE by cust_id
SELECT SQRT(AVG((scored.pred - actual.age) * (scored.pred - actual.age))) rmse,
       AVG(ABS(scored.pred - actual.age)) mae
  FROM (SELECT cust_id,
               PREDICTION(GLMR_SH_Regr_sample USING *) pred
          FROM mining_data_test_v) scored
       INNER JOIN mining_data_test_v actual
          ON actual.cust_id = scored.cust_id;

-- 3. Residuals
--    If the residuals show substantial variance between
--    the predicted value and the actual, you can consider
--    changing the algorithm parameters.
--
-- Residual = predicted age minus actual age, listed for the test rows
-- whose prediction is at most 18.388.
WITH scored AS (
  SELECT cust_id,
         PREDICTION(GLMR_SH_Regr_sample USING *) pred
    FROM mining_data_test_v
)
SELECT TO_CHAR(ROUND(r.pred, 4)) prediction,
       r.residual
  FROM (SELECT scored.pred,
               (scored.pred - actual.age) residual
          FROM scored
               INNER JOIN mining_data_test_v actual
                  ON actual.cust_id = scored.cust_id
         ORDER BY scored.pred ASC) r
 WHERE r.pred <= 18.388
 ORDER BY r.pred;

-----------------------------------------------------------------------
--                               SCORE NEW DATA
-----------------------------------------------------------------------

-- Now that the model has an invalid covariance matrix, it is 
-- no longer possible to obtain confidence bounds.
-- So the lower (PL) and upper (PU) confidence bounds are NULL
SET LONG 20000
SET LINESIZE 200
SET PAGESIZE 100
-- pr = prediction, pl/pu = lower/upper bound, pd = prediction details
SELECT cust_id,
       PREDICTION(GLMR_SH_Regr_sample USING *)               pr,
       PREDICTION_BOUNDS(GLMR_SH_Regr_sample USING *).lower  pl,
       PREDICTION_BOUNDS(GLMR_SH_Regr_sample USING *).upper  pu,
       PREDICTION_DETAILS(GLMR_SH_Regr_sample USING *)       pd
  FROM mining_data_apply_v
 WHERE cust_id < 100010
 ORDER BY cust_id;
